#!/usr/bin/env python
# encoding=utf-8
import os
import re
from xtls.codehelper import timeit
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage


@timeit
def convert(fp):
    """
    Convert a PDF to text using pdfminer.

    :param fp: binary file-like object positioned at the start of the PDF
        data (the script entry point passes a ``StringIO`` wrapping the raw
        bytes).
    :return: the extracted text as unicode, with every run of consecutive
        newlines collapsed into a single newline.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # An empty page-number set is passed as the page filter; presumably
        # this selects every page (confirm against the pdfminer docs).
        for page in PDFPage.get_pages(fp, set()):
            interpreter.process_page(page)

        # Read the buffer before it is closed below.
        text = retstr.getvalue()
    finally:
        # Release the converter and the buffer even if a page fails to parse.
        device.close()
        retstr.close()

    # Only decode byte strings; a buffer that already yields unicode must not
    # be pushed through an implicit ASCII encode by ``.decode``.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    return re.sub(u'[\n]+', u'\n', text)


@timeit
def convert2(path):
    """
    Convert a PDF file to text by running the ``pdftotext`` command.

    :param path: filesystem path of the PDF file. It is shell-quoted before
        being placed on the command line, so spaces and shell metacharacters
        in it reach ``pdftotext`` literally instead of being interpreted by
        the shell.
    :return: the command's standard output decoded as UTF-8 (unicode).
    :raises UnicodeDecodeError: if the output is not valid UTF-8.
    """
    # Imported locally so this block needs no file-level import changes;
    # ``quote`` lives in ``shlex`` on Python 3.3+ and in ``pipes`` on Python 2.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    # The trailing "-" is the output-file argument given to pdftotext; the
    # converted text is then read from the command's stdout.
    pipe = os.popen('pdftotext {path} -'.format(path=quote(path)))
    try:
        raw = pipe.read()
    finally:
        # Always close the pipe so the child process is waited on.
        pipe.close()
    return raw.decode('utf-8')


if __name__ == '__main__':
    # with open('/home/xlzd/abc.pdf', 'rb') as fp:
    #     fp = StringIO(fp.read())
    # # StringIO()
    # print len(convert(fp))
    import requests
    # data = requests.get('http://www.cninfo.com.cn/finalpage/2010-03-22/57717754.PDF').content
    # data = requests.get('http://www.csrc.gov.cn/pub/newsite/scb/ssgshyfljg/201605/W020160506624474072536.pdf').content
    data = requests.get('http://www.cninfo.com.cn/finalpage/2002-10-25/10036411.PDF').content

    fp = StringIO(data)
    content = convert(fp)
    print content
